########################################################
## This code defines the function that performs parameter
## tuning for our three machine learning algorithms
########################################################

library(haven)
library(caret)
library(randomForest)
library(doParallel)
library(mice)
library(plyr)
library(dplyr)

## Parameter tuning:

tuning <- function(data, 
                   dependent, 
                   s_tuning = 0.1, 
                   firstvar = "Gender", 
                   seed = 2111,
                   noisily = FALSE,
                   mtry = c(10, 20, 30, 40, 50),
                   min.node.size = c(1, 3, 5, 7, 9, 12, 14),
                   eta = 10^(seq(-4, log10(0.9), length.out = 50)),
                   nrounds = c(250),
                   lambda = 10^(seq(-4, -1, length.out = 50)),
                   folds = 3
                   ) {
  
  start_time <- Sys.time()
  
  ## Set seed at the beginning every time:
  set.seed(seed, "L'Ecuyer-CMRG")
  
   
  ########################################################
  ## Define the sub-sample for creation of predictions
  ########################################################
  
  n_parametertuning <- round(nrow(data)*s_tuning, digits=0) ## Using 10% of the sample for the tuning of the parameters
  
  first_column <- which(colnames(data)==firstvar) ## Identifying column number where the variables of interest start
  
  ## Keeping only those observations with non-missing outcome variables
  y_column <- which(colnames(data) == dependent) ## Column number with the dependent variable of interest
  data <- data[complete.cases(data[, y_column:y_column]), ] ## Restrict the dataset to observations with non-missing dependent variable
  
  ## Creating dataset for parameter tuning
  parametertuning <- data[data$n_order <= n_parametertuning,] ## Selects the first 10% observations of the whole data
  parametertuning_y <- factor(parametertuning[[dependent]])
  parametertuning_x <- parametertuning[, first_column:ncol(parametertuning)] ## Keeping all the possible covariates we want to have in the model
  parametertuning_final <- as.data.frame(cbind(parametertuning_y, parametertuning_x)) ## Creating final dataset used for parameter tuning
  
  ## Correcting format of outcome variables
  levels(parametertuning_final$parametertuning_y) <- c("no", "yes")

  ########################################################
  ## Feeding the model with parameter combinations
  ########################################################
  ## Creating the different parameter combinations to be fed into each model
  ## Example: for the Random Forest we have 4 different values of mtry and min node size. So there are 16 possible parameter combinations
  
  rfgrid <- expand.grid(
    mtry = mtry,
    splitrule=c("gini"),
    min.node.size= min.node.size)
  
  boostgrid <- expand.grid(
    nrounds = nrounds,
    max_depth = 6,
    eta = eta,
    gamma = 0,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 0.5
    )
  
  lassogrid <- expand.grid(
    alpha = c(1),
    lambda = lambda)
  
  ## Telling the model that we want to use 3 fold Cross Validation 
  train_control <- trainControl(method = "cv", number = folds, savePredictions = TRUE, allowParallel = T, classProbs = TRUE, summaryFunction = twoClassSummary)
  
  ########################################################
  ## Parameter tuning
  ########################################################
  
  ## Random forest
  if (noisily) {
    print(paste0("Tuning Random Forest, ", Sys.time()))
  }
  
  rrf <- train(parametertuning_y~., data = parametertuning_final, method = "ranger", importance = 'impurity',
               tuneGrid = rfgrid, num.trees = 250, sample.fraction = 0.5, replace = TRUE, metric = "ROC",
               trControl=train_control, maximize = TRUE, na.action = na.omit, verbose = noisily)
  
  ## Boosted Gradient
  if (noisily) {
    print(paste0("Tuning Boosted Gradient, ", Sys.time()))
  }

  rboost <- train(parametertuning_y~., data = parametertuning_final, method = "xgbTree", 
                  tuneGrid = boostgrid, metric = "ROC",
                  trControl=train_control, maximize = TRUE, na.action = na.omit, verbose = noisily)
  
  ## LASSO
  if (noisily) {
    print(paste0("Tuning Lasso, ", Sys.time()))
  }
  rlasso <- train(parametertuning_y~., data = parametertuning_final, method = "glmnet", 
                  tuneGrid = lassogrid, metric = "ROC",
                  trControl=train_control, maximize = TRUE, na.action = na.omit, verbose = noisily)
  

  ########################################################
  ## Save optimal parameters
  ########################################################
  
  ## Random forest
  rfgrid_final <- expand.grid(
    mtry = c(rrf$bestTune$mtry),
    splitrule = c("gini"),
    min.node.size = c(rrf$bestTune$min.node.size))
  
  boostgrid_final <- expand.grid(
    max_depth = 6,
    nrounds = c(rboost$bestTune$nrounds),
    eta = c(rboost$bestTune$eta),
    gamma = 0,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 0.5
  )
  
  ## LASSO
  lassogrid_final <- expand.grid(
    alpha = c(1),
    lambda = c(rlasso$bestTune$lambda))
  

  ########################################################
  ## Return results
  ########################################################
  return(list(rfgrid_final = rfgrid_final, 
              boostgrid_final = boostgrid_final, 
              lassogrid_final = lassogrid_final,
              rfgrid_search = rrf$results,
              boostgrid_search = rboost$results,
              lassogrid_search = rlasso$results
              ))
      
  if (noisily) {
    print(paste0("Tuning time: ", difftime(Sys.time(), start_time, units = c("mins")), "m"))
  }
}

## Here follows a simpler function that only returns the parameters (this is useful
## sometimes when we want to tune the model on a particular sample, but we
## don't want to use the usual sample split procedure):

tuning.int <- function(data, 
                       dependent, 
                       seed = NULL,
                       noisily = FALSE, 
                       mtry = c(10, 20, 30, 40, 50),
                       min.node.size = c(1, 3, 5, 7, 9, 12, 14),
                       eta = 10^(seq(-4, -0.04575749, length.out = 50)),
                       nrounds = c(250),
                       lambda = 10^(seq(-4, -1, length.out = 50))
                       ) {
  
  start_time <- Sys.time()
  
  ## Set seed at the beginning every time:
  if (!is.null(seed)) { 
    set.seed(seed, "L'Ecuyer-CMRG")
  }
  
  ########################################################
  ## Feeding the model with parameter combinations
  ########################################################
  ## Creating the different parameter combinations to be fed into each model
  ## Example: for the Random Forest we have 4 different values of mtry and min node size. So there are 16 possible parameter combinations
  
  rfgrid <- expand.grid(
    mtry = mtry,
    splitrule=c("gini"),
    min.node.size= min.node.size)
  
  boostgrid <- expand.grid(
    nrounds = nrounds,
    max_depth = 6,
    eta = eta,
    gamma = 0,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 0.5
  )
  
  lassogrid <- expand.grid(
    alpha = c(1),
    lambda = lambda)
  
  ## Telling the model that we want to use 3 fold Cross Validation 
  train_control <- trainControl(method = "cv", 
                                number = 3, 
                                savePredictions = TRUE, 
                                allowParallel = T, 
                                classProbs = TRUE, 
                                summaryFunction = twoClassSummary)
  
  ########################################################
  ## Parameter tuning
  ########################################################
  
  ## Random forest
  if (noisily) {
    print(paste0("----> Tuning Random Forest, ", Sys.time()))
  }
  
  rrf <- train(as.formula(paste0(dependent, " ~ .")), 
               data = data, 
               method = "ranger", 
               importance = 'impurity',
               tuneGrid = rfgrid, 
               num.trees = 250, 
               sample.fraction = 0.5, 
               replace = TRUE, 
               metric = "ROC",
               trControl = train_control, 
               maximize = TRUE, 
               na.action = na.omit)
  
  ## Boosted Gradient
  if (noisily) {
    print(paste0("----> Tuning Boosted Gradient, ", Sys.time()))
  }
  rboost <- train(as.formula(paste0(dependent, " ~ .")), 
                  data = data, 
                  method = "xgbTree", 
                  tuneGrid = boostgrid, 
                  metric = "ROC", 
                  trControl = train_control, 
                  maximize = TRUE, 
                  na.action = na.omit)

  ## LASSO
  if (noisily) {
    print(paste0("----> Tuning Lasso, ", Sys.time()))
  }
  rlasso <- train(as.formula(paste0(dependent, " ~ .")), 
                  data = data, 
                  method = "glmnet", 
                  tuneGrid = lassogrid, 
                  metric = "ROC",
                  trControl = train_control, 
                  maximize = TRUE, 
                  na.action = na.omit)
  
  
  ########################################################
  ## Save parameters
  ########################################################
  
  ## Random forest
  rfgrid_final <- expand.grid(mtry = c(rrf$bestTune$mtry),
                              splitrule = c("gini"),
                              min.node.size = c(rrf$bestTune$min.node.size))
  
  ## Boosted Gradient
  boostgrid_final <- expand.grid(
    max_depth = 6,
    nrounds = c(rboost$bestTune$nrounds),
    eta = c(rboost$bestTune$eta),
    gamma = 0,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 0.5
  )
  
  ## LASSO
  lassogrid_final <- expand.grid(alpha = c(1),
                                 lambda = c(rlasso$bestTune$lambda))
  
  
  ########################################################
  ## Return results
  ########################################################
  return(list(rfgrid_final = rfgrid_final, 
              boostgrid_final = boostgrid_final, 
              lassogrid_final = lassogrid_final))
  
  if (noisily) {
    print(paste0("----> Tuning time:", difftime(Sys.time(), start_time, units = c("mins")), "m"))
  }
}